#Alexander F. Gazmararian
#afg2@princeton.edu
#January 9, 2024

#Load packages
library(tidyverse)
library(survey)
library(tidycensus)
library(here)
library(kableExtra)

#load survey data
g <- readRDS(here("data", "inter", "fairsurveys", "fairsurvey_2021.rds"))

#load census data----
acs.vars <- tidycensus::load_variables(2018, "acs5")
#sex by age by education
SexAgeEdu <- subset(acs.vars, concept == "SEX BY AGE BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 18 YEARS AND OVER")
SexAgeEdu <- SexAgeEdu[stringr::str_count(SexAgeEdu$label, "!") == 8, ]
#household income
income <- subset(acs.vars, concept == "HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2018 INFLATION-ADJUSTED DOLLARS)")
income <- income[-1,]
#sex by age by white
RaceAgeSex <- subset(acs.vars, concept %in% c("SEX BY AGE (WHITE ALONE)", "SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)",
                                              "SEX BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE)", "SEX BY AGE (ASIAN ALONE)",
                                              "SEX BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)", "SEX BY AGE (SOME OTHER RACE ALONE)",
                                              "SEX BY AGE (TWO OR MORE RACES)"))
RaceAgeSex <- RaceAgeSex[stringr::str_count(RaceAgeSex$label, "!")==6,]
##B01003_001==total population
#"B25100_002", "B25100_008"
getvars <- c("B02001_002", "B01003_001", SexAgeEdu$name, income$name, RaceAgeSex$name)
#download census data
acs_long <- tidycensus::get_acs("county", county = "Greene County", state = "PA", variables = getvars, year = 2018, sumfile = "acs5")
#joint distribution of sex,age,and education
acs.SexAgeEdu <- subset(acs_long, variable %in% SexAgeEdu$name)
acs.SexAgeEdu <- merge(acs.SexAgeEdu, SexAgeEdu, by.x = "variable", by.y = "name")
joint <-
  acs.SexAgeEdu %>%
  tidyr::separate(col = label, into = c("estimate2", "total", "gender", "age_acs", "edu"), sep = "!!")
joint <- subset(joint, select = -c(concept, estimate2, total, geography, NAME, GEOID))
joint <-
  joint %>%
  dplyr::mutate(
    edu_acs = dplyr::case_when(
      edu %in% c("Less than 9th grade", "9th to 12th grade, no diploma", "High school graduate (includes equivalency)") ~ "No college",
      edu %in% c("Associate's degree", "Some college, no degree", "Bachelor's degree", "Graduate or professional degree") ~ "College"
    )
  )
joint <-
  joint %>%
  dplyr::group_by(gender, age_acs, edu_acs) %>%
  dplyr::summarise(estimate = sum(estimate))
joint$Freq <- joint$estimate / sum(joint$estimate)
joint$estimate <- NULL
#joint distribution of race, age, and sex
totalpop <- subset(acs_long, variable == "B01003_001")
acs.RaceAgeSex <- subset(acs_long, variable %in% RaceAgeSex$name)
acs.RaceAgeSex <- merge(acs.RaceAgeSex, RaceAgeSex, by.x = "variable", by.y = "name")
race.dist <-
  acs.RaceAgeSex %>%
  tidyr::separate(col = label, into = c("estimate2", "total", "gender", "age_acs"), sep = "!!") %>%
  dplyr::select(-c(variable, GEOID, NAME, moe, estimate2, total, geography))
race.dist$concept <- ifelse(race.dist$concept == "SEX BY AGE (WHITE ALONE)", "white", "nonwhite")
names(race.dist)[4] <- "race_acs"
##adjust age
race.dist <-
  race.dist %>%
  dplyr::mutate(
    age_acs = dplyr::case_when(
      age_acs %in% c("18 and 19 years", "20 to 24 years") ~ "18 to 24 years",
      age_acs %in% c("25 to 29 years", "30 to 34 years") ~ "25 to 34 years",
      age_acs %in% c("35 to 44 years") ~ "35 to 44 years",
      age_acs %in% c("45 to 54 years", "55 to 64 years") ~ "45 to 64 years",
      age_acs %in% c("65 to 74 years", "75 to 84 years", "85 years and over") ~ "65 years and over",
      T ~ NA_character_
    )
  )
race.dist <- subset(race.dist, !is.na(age_acs))
race.dist <-
  race.dist %>%
  dplyr::group_by(gender, age_acs, race_acs) %>%
  dplyr::summarise(estimate = sum(estimate))
race.dist$Freq <- race.dist$estimate / sum(race.dist$estimate)
#race.dist$Freq <- race.dist$Freq * nrow(d)
race.dist$estimate <- NULL
#prepare income distribution
acs.income <- subset(acs_long, variable %in% income$name)
acs.income <- merge(acs.income, income, by.x = "variable", by.y = "name")
income.dist <-
  acs.income %>%
  tidyr::separate(col = label, into = c("estimate2", "total", "income"), sep = "!!")
income.dist <- subset(income.dist, select = -c(concept, estimate2, total, geography, NAME, GEOID))
income.dist <-
  income.dist %>%
  dplyr::mutate(
    income_acs = dplyr::case_when(
      income %in% c("Less than $10,000", "$10,000 to $14,999", "$15,000 to $19,999") ~ "Less than $20,000",
      income %in% c("$20,000 to $24,999", "$25,000 to $29,999", "$30,000 to $34,999", "$35,000 to $39,999") ~ "$20,000 to $39,999",
      income %in% c("$40,000 to $44,999", "$45,000 to $49,999", "$50,000 to $59,999") ~ "$40,000 to $59,999",
      income %in% c("$60,000 to $74,999", "$75,000 to $99,999") ~ "$60,000 - $99,999",
      income %in% c("$100,000 to $124,999", "$125,000 to $149,999", "$150,000 to $199,999", "$200,000 or more") ~ "$100,000 or more",
      T ~ NA_character_
    )
  )
income.dist <-
  income.dist %>%
  dplyr::group_by(income_acs) %>%
  dplyr::summarise(estimate = sum(estimate))
income.dist$Freq <- income.dist$estimate / sum(income.dist$estimate)
income.dist$estimate <- NULL
#prepare survey income distribution
g <-
  g %>%
  dplyr::mutate(
    income_acs = dplyr::case_when(
      income %in% c("$60,000 to $79,999", "$80,000 to $99,999") ~ "$60,000 - $99,999",
      income %in% c("$100,000 to $119,999", "$120,000 to $139,999", "$140,000 to $159,999", "$160,000 or more") ~ "$100,000 or more",
      T ~ as.character(income)
    )
  )
#impute missing data with most common category
g %>%
  dplyr::group_by(income_acs) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::arrange(desc(n))
g$income_acs <- ifelse(is.na(g$income_acs), "$60,000 - $99,999", g$income_acs)

#prep survey data for age variable
#prep race variable
g$race_acs <- ifelse(g$race == "White/Caucasian", "white", "nonwhite")
#prep gender variable
g$gender <- ifelse(g$sex == 1, "Female", "Male")
#impute missing gender with most common gender
g$gender <- ifelse(is.na(g$gender),  "Female", g$gender)

#prep highest ed variable
g$edu_acs <- with(g, ifelse(edu %in% c("Bachelor's Degree (for example; BA, BBA, BS)",
                                       "Bachelor's Degree (for example: BA, BBA, BS)", "Doctorate (for example: PhD, EdD)",
                                       "Master's Degree (for example: MA, MS, MEng)", "Professional Degree (for example: MD, DDS, JD)"), "College", "No college"))
table(g$edu_acs)
g <-
  g %>%
  dplyr::mutate(
    age_acs = dplyr::case_when(
      age == "18-24" ~ "18 to 24 years",
      age == "25-34" ~ "25 to 34 years",
      age == "35-44" ~ "35 to 44 years",
      age %in% c("45-54", "55-64") ~ "45 to 64 years",
      age == "65 or older" ~ "65 years and over"
    )
  )
#check joint distribution in sample----
#gender x age x education
d <-
  g %>%
  dplyr::group_by(gender, age_acs, edu_acs) %>%
  dplyr::summarize(
    n = n(),
    n_weight = sum(weights)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    prop = n / sum(n),
    prop_weight = n_weight / sum(n_weight)
  ) %>%
  dplyr::left_join(., joint, by = c("gender", "age_acs", "edu_acs")) %>%
  dplyr::mutate(
    age_acs = dplyr::case_when(
      age_acs %in% c("18 to 24 years", "25 to 34 years") ~ "18-34 years",
      age_acs %in% c("35 to 44 years", "45 to 64 years") ~ "35-64 years",
      age_acs == "65 years and over" ~ "$>$65 years"
    )
  ) %>%
  dplyr::group_by(gender, age_acs, edu_acs) %>%
  dplyr::summarise(across(n:Freq, ~ sum(.x)))

d$var <- with(d, paste(gender, age_acs, edu_acs, sep = " $\\times$ "))
d <- d[, c(9, 6, 7, 8)]
d <- d[c(3:6, 1, 2, 9:12, 7,8), ]

#income
income.sample <-
  g %>%
  dplyr::group_by(income_acs) %>%
  dplyr::summarize(
    n = n(),
    n_weight = sum(weights)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    prop = n / sum(n),
    prop_weight = n_weight / sum(n_weight)
  ) %>%
  dplyr::left_join(., income.dist, by = "income_acs")
names(income.sample)[1] <- "var"
income.sample <- income.sample[c(5, 2, 3, 4, 1), c(1, 4, 5, 6)]
income.sample$var <- c("$<$\\$20,000", "\\$20,000-39,999", "\\$40,000-59,999", "\\$60,000-99,999", "$>$\\$100,000")
#race
prop.white <- with(g, mean(white))
prop.white.w <- with(g, mean(white * weights))
pop.white <- race.dist %>%
  dplyr::group_by(race_acs) %>%
  dplyr::summarise(Freq = sum(Freq)) %>%
  slice(2)
r <- data.frame(var = "White", prop = prop.white, prop_weight = prop.white.w, Freq = pop.white$Freq)
#bind together
sample.dist <- rbind(d, income.sample, r)
names(sample.dist) <- c("", "Sample", "Weighted", "Population")
sample.dist <- sample.dist[, c(1, 2, 4, 3)]
tabout <- here("output", "tables", "si_tab_fair2021.txt")
kableExtra::kbl(sample.dist, booktabs = TRUE, digits = 2, format = "latex", escape = FALSE, linesep = "", align = "lccc",
                caption = "Representiveness of unweighted and weighted sample \\label{tab:sample_desc}",
                position = "t") %>%
  kableExtra::kable_styling(position = "center", font = 10.25) %>%
  kableExtra::pack_rows(escape = FALSE, "Sex/Age/Education", 1, 12) %>%
  kableExtra::pack_rows("Income", 13, 17) %>%
  kableExtra::pack_rows("Race", 18, 18) %>%
  cat(., file = tabout)
fair21 <- readLines(tabout, warn = FALSE)
fair21 <- fair21[-c(1,2,3,4,5,36)]
cat(fair21, file = tabout)
